library(ggplot2)

setwd("/Users/ab6415/Documents/Work_laptop_stuff/Bioinformatics/Analysis/Beltran_et_al_2020_NEE/03_04_Figures1-2/epimutation_clustering/")
epimut_data<-read.table("../../03_04_Figures1-2/k-means_segmentation_epimutable_genes.txt")
epimut_genes_noiso<-read.table("epimutable_genes.noiso.txt"); epimut_genes_noiso<-epimut_genes_noiso$V1


gene_order<-read.table("cel_geneorder_numbered.txt"); colnames(gene_order)<-c("chr","cosmidID","order_pos")
gene_positions<-read.table("cel_genepos_and_cosmidID.bed"); colnames(gene_positions)<-c("chr","start","end","cosmidID")

ttg_normcounts<-read.table("../../02_Normalised_counts/22G_DEseqnorm_counts_averaged.txt")

gene_order_and_positions<-merge(gene_order,gene_positions[,c("start","end","cosmidID")],by="cosmidID")
gene_order_and_positions<-gene_order_and_positions[-which(duplicated(gene_order_and_positions$cosmidID)),]
gene_order_and_positions<-gene_order_and_positions[order(gene_order_and_positions$order_pos),]
gene_order_and_positions$meanpos<-(gene_order_and_positions$start+gene_order_and_positions$end)/2

iso_to_noiso<-data.frame(iso=rownames(epimut_data),noiso=epimut_genes_noiso)


order_distributions<-function(gene_set){
  gene_order<-c(diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="I"),"order_pos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="II"),"order_pos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="III"),"order_pos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="IV"),"order_pos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="V"),"order_pos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="X"),"order_pos"]))
  return(gene_order)
}

distance_distributions<-function(gene_set){
  distances<-c(diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="I"),"meanpos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="II"),"meanpos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="III"),"meanpos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="IV"),"meanpos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="V"),"meanpos"]),
                diff(gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% gene_set & gene_order_and_positions$chr=="X"),"meanpos"]))
  return(distances)
}

Here I calculate the number of genes separating epimutable genes in the linear genome - neighbouring genes are at a distance of 1, etc. I see a clear excess of neighbouring genes compared to randomly selected genes - in support of clustering.

hist(order_distributions(epimut_genes_noiso),breaks=seq(1000))

length(which(order_distributions(epimut_genes_noiso)==1))
## [1] 30
median(distance_distributions(epimut_genes_noiso))
## [1] 148452.2
length(order_distributions(epimut_genes_noiso))
## [1] 390
length(which(epimut_genes_noiso %in% gene_order_and_positions$cosmidID))
## [1] 396
random_sets_numneighbors<-rep(0,10000)
random_sets_distances<-rep(0,10000)
for (i in seq(10000)){
  rand_set<-sample(gene_order_and_positions$cosmidID,size = 396,replace = FALSE)
  random_sets_numneighbors[i]<-length(which(order_distributions(rand_set)==1))
  random_sets_distances[i]<-median(distance_distributions(rand_set))
}

order_data<-data.frame(distance=c(order_distributions(epimut_genes_noiso),order_distributions(rand_set)),
                          data=c(rep("real",length(order_distributions(epimut_genes_noiso))),rep("simulated",length(order_distributions(rand_set)))))
ggplot(order_data)+geom_bar(aes(x=distance,fill=data),position ="dodge")+coord_cartesian(xlim=c(0,100))

library(ggplot2)
ggplot(data.frame(random_sets_numneighbors))+geom_bar(aes(x=random_sets_numneighbors))+
  coord_cartesian(xlim=c(-1,32))+
  geom_vline(xintercept = 30,col="red",linetype="dashed")+
  xlab("number of neighbouring epimutable gene pairs")+
  theme_classic()

ggsave("epimutable_gene_pairs_observed_vs_1e4randomsets.pdf")
## Saving 7 x 5 in image

We can see this type of effect also when considering the genomic distances (bp) between epimutable genes.

ggplot(data.frame(random_sets_distances))+geom_histogram(aes(x=random_sets_distances))+
  coord_cartesian(xlim=c(100000,200000))+
  geom_vline(xintercept = median(distance_distributions(epimut_genes_noiso)),col="red",linetype="dashed")+
  xlab("median distance between consecutive epimutable genes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#distance distribution from real data
ggplot(data.frame(distance_distributions(epimut_genes_noiso)))+geom_histogram(aes(x=distance_distributions.epimut_genes_noiso.),breaks=seq(0,2.6e6,by = 1.3e4))+
  coord_cartesian(xlim=c(0,2.6e6),ylim=c(0,50))

#distance distribution from random set
ggplot(data.frame(distance_distributions(rand_set)))+geom_histogram(aes(x=distance_distributions.rand_set.),breaks=seq(0,2.6e6,by = 1.3e4))+
  coord_cartesian(xlim=c(0,2.6e6),ylim=c(0,50))

We see a clear enrichment of neighbouring epimutable genes relative to what would be expected by chance - suggesting that spatial clustering effects exist in the system. Similarly, the median distance between neighbouring epimutable genes is smaller than would be expected by chance, and comparing the distance distributions between epimutable genes and genes selected randomly shows that epimutable genes are enriched in the lower range of distances.

So with this we can conclude than genes subjected to epimutation tend to cluster more than expected - now the question is, do they epimutate in a coordinated manner?

To test this, I will select neighbouring epimutable genes and do the following test:

Also a Pearson correlation test was applied to the count data of pairs of genes across samples –> quantify number of genes with significant r>0.75 correlations

#with the full dataset

epimut_genes_noiso
##   [1] 3R5.1        AC3.5        B0001.5      B0035.7      B0261.8     
##   [6] B0280.15     B0284.2      B0285.7      B0348.5      B0350.2     
##  [11] B0416.6      B0454.9      B0457.4      B0511.11     B0511.3     
##  [16] B0511.4      B0511.5      B0513.4      B0554.7      C01B7.6     
##  [21] C01B9.1      C02F5.10     C03D6.1      C03E10.3     C03G6.12    
##  [26] C04F6.1      C05D11.13    C05D2.8      C06A1.3      C06A5.12    
##  [31] C06B8.7      C06E4.5      C06H2.4      C07G1.6      C07G1.7     
##  [36] C07G3.9      C08B11.4     C08E3.2      C08E3.6      C08E8.7     
##  [41] C08F11.7     C09G5.7      C13B9.1      C14F5.3      C15B12.1    
##  [46] C16C10.11    C16C8.20     C16C8.21     C17F4.7      C18C4.17    
##  [51] C18D4.2      C18D4.6      C24H10.4     C24H12.6     C25D7.15    
##  [56] C26C6.3      C27D8.1      C27D8.2      C27D8.3      C27H5.2     
##  [61] C28A5.4      C28A5.6      C28C12.1     C28F5.1      C30F12.5    
##  [66] C30G12.2     C30G12.6     C30G7.3      C34C12.6     C34C12.9    
##  [71] C34D4.1      C35B1.5      C35C5.1      C36B1.9      C36B7.6     
##  [76] C36C9.3      C36F7.1      C36F7.5      C37A5.9      C38D9.1     
##  [81] C38D9.3      C38H2.3      C39B5.3      C40A11.7     C41C4.1     
##  [86] C42D4.19     C43D7.10     C43D7.11     C44B7.11     C44H9.4     
##  [91] C45E1.1      C45G7.13     C46A5.8      C46F11.6     C46G7.3     
##  [96] C47B2.8      C48B6.9      C48D1.6      C49F5.1      C49G7.11    
## [101] C53C7.5      C54E10.1     C55C3.3      C55C3.7      C56G7.3     
## [106] D1005.3      D1037.3      D1054.5      D2024.1      D2024.18    
## [111] D2085.1      F01D4.2      F01F1.3      F07A11.1     F07B7.12    
## [116] F07D10.1     F07G6.7      F07H5.5      F08A8.2      F08F8.3     
## [121] F08G12.4     F08G5.3      F09B12.3     F11A5.18     F13B12.7    
## [126] F13H8.7      F13H8.8      F14F4.3      F16B4.6      F17B5.1     
## [131] F19H6.5      F20D1.1      F21D5.4      F21D9.7      F22B7.9     
## [136] F22F1.2      F22H10.10    F23A7.7      F23B12.4     F23C8.8     
## [141] F25G6.1      F26D10.9     F28H7.3      F31C3.11     F31E9.6     
## [146] F32A7.6      F32A7.7      F32D8.1      F33D11.8     F33E2.3     
## [151] F33H12.1     F35C5.6      F35H10.5     F36A2.3      F36G9.4     
## [156] F36H5.11     F37B4.2      F38A1.5      F39C12.1     F39D8.7     
## [161] F39E9.6      F39G3.1      F40A3.5      F40D4.16     F40D4.17    
## [166] F42C5.6      F42F12.3     F42G4.6      F42G4.7      F42G8.11    
## [171] F42G8.4      F42G8.5      F44G3.6      F45C12.8     F45H10.1    
## [176] F46C3.1      F46F5.3      F46F5.5      F47A4.2      F47A4.5     
## [181] F47B8.6      F47B8.7      F47B8.8      F47F6.1      F48E3.9     
## [186] F49B2.3      F49C12.15    F49C5.4      F49E11.2     F49E7.1     
## [191] F52C9.1      F53A3.1      F53B6.2      F53B6.6      F53B6.7     
## [196] F54C9.4      F54D10.2     F54F7.6      F54F7.7      F55A4.4     
## [201] F55C9.13     F55C9.14     F55C9.7      F55C9.8      F55G1.10    
## [206] F55G11.8     F55H12.3     F56B3.2      F56D12.4     F56D2.3     
## [211] F56D6.15     F57G4.4      F57G4.8      F58B3.1      F58E1.13    
## [216] F58E10.7     F58G6.3      F59A1.9      F59A7.7      F59E12.3    
## [221] H05C05.4     H06O01.4     H14A12.5     H14E04.3     H19M22.2    
## [226] K01D12.1     K02C4.4      K06C4.17     K07E8.10     K07H8.6     
## [231] K08C9.7      K08D12.4     K08F8.6      K09A9.4      K09E3.6     
## [236] K09F6.6      K09H9.8      K10B3.10     K10D2.8      K10F12.4    
## [241] K10G9.2      K11C4.3      K11D2.1      K11D2.2      K12B6.9     
## [246] M01E10.2     M01E11.4     M03A8.2      M04B2.6      M04B2.8     
## [251] M04G12.2     R01H10.6     R02F2.2      R04B5.9      R06A10.1    
## [256] R06C1.4      R07B1.10     R07E5.4      R07H5.4      R08E3.3     
## [261] R102.4       R10E4.3      R10F2.1      R11E3.3      R12B2.1     
## [266] R151.1       R186.1       T02B11.7     T02G5.4      T03F6.1     
## [271] T04D1.4      T05C3.2      T05C3.6      T05D4.1      T05E11.9    
## [276] T05G5.4      T07D4.7      T08B2.4      T10A3.1      T10C6.12    
## [281] T10D4.3      T10D4.5      T11G6.7      T12B5.1      T12B5.2     
## [286] T12B5.6      T13H5.3      T14G10.8     T14G11.3     T16G12.4    
## [291] T16H12.2     T20F10.7     T20F5.5      T20F7.6      T20H4.2     
## [296] T22A3.4      T23D8.6      T23F6.1      T23F6.3      T23G11.1    
## [301] T23G11.11    T24A11.3     T25C12.3     T25D3.3      T27F2.4     
## [306] T28B8.1      T28D6.4      T28F2.1      T28F3.4      T28H10.3    
## [311] VW06B3R.1    VY10G11R.1   W01A11.3     W01A11.6     W01A11.7    
## [316] W02A2.1      W02B8.3      W02D9.10     W03C9.6      W03D2.5     
## [321] W03F11.5     W04A8.2      W04B5.3      W05B2.2      W05F2.6     
## [326] W05H9.3      W06A11.4     W09D6.5      Y102A11A.3   Y102A5C.6   
## [331] Y102A5C.8    Y105C5A.1269 Y110A7A.20   Y116A8C.18   Y116F11A.1  
## [336] Y119D3B.21   Y119D3B.8    Y17D7B.10    Y17D7B.6     Y17D7B.8    
## [341] Y17D7C.3     Y17D7C.4     Y18H1A.14    Y20F4.2      Y22F5A.5    
## [346] Y26G10.5     Y34F4.5      Y37E11AL.1   Y37E3.5      Y37E3.7     
## [351] Y37H2B.1     Y38F2AL.12   Y38F2AR.10   Y39A3CL.2    Y39A3CR.8   
## [356] Y39B6A.11    Y39B6A.41    Y39B6A.9     Y39D8B.3     Y39G10AR.17 
## [361] Y40A1A.4     Y43F8B.22    Y43F8B.8     Y43F8B.9     Y44E3A.2    
## [366] Y45F10D.10   Y46C8AL.8    Y47D7A.16    Y47H10A.1    Y47H9C.10   
## [371] Y48G10A.7    Y48G1C.5     Y49A3A.4     Y49F6A.10    Y49F6A.5    
## [376] Y50E8A.14    Y53C10A.3    Y53F4B.16    Y53F4B.17    Y53F4B.45   
## [381] Y53G8AM.1    Y53H1B.1     Y54E2A.4     Y54F10AM.11  Y54F10BM.15 
## [386] Y54G2A.44    Y55F3AM.2    Y57G11C.1130 Y60A3A.24    Y60A3A.27   
## [391] Y62E10A.3    Y62E10A.4    Y65B4BL.7    Y67D8A.3     Y68A4A.13   
## [396] Y69A2AR.14   Y69A2AR.31   Y71F9AL.5    Y71G12A.3    Y71G12B.11  
## [401] Y71H2AL.2    Y73F8A.32    Y75D11A.5    Y75D11A.7    Y76A2B.2    
## [406] Y76B12C.4    Y7A5A.21     Y7A5A.22     Y80D3A.1     Y81G3A.4    
## [411] Y82E9BL.18   Y82E9BR.4    Y87G2A.2     Y8G1A.2      Y94H6A.12   
## [416] Y94H6A.7     Y95B8A.1     Y95B8A.10    ZC123.4      ZC15.1      
## [421] ZC190.7      ZC190.8      ZC204.2      ZC317.6      ZC328.5     
## [426] ZC416.1      ZC84.3       ZK1098.6     ZK1098.7     ZK1128.7    
## [431] ZK1151.1     ZK1236.9     ZK131.5      ZK177.9      ZK228.1     
## [436] ZK250.15     ZK355.2      ZK673.4      ZK856.5      ZK909.5     
## [441] ZK945.1      ZK973.6     
## 442 Levels: 3R5.1 AC3.5 B0001.5 B0035.7 B0261.8 B0280.15 B0284.2 ... ZK973.6
gene_order_and_positions_epimutable<-gene_order_and_positions[which(gene_order_and_positions$cosmidID %in% epimut_genes_noiso),]
gene_order_and_positions_epimutable[sort(unique(c(which(diff(gene_order_and_positions_epimutable$order_pos)==1),
                                            which(diff(gene_order_and_positions_epimutable$order_pos)==1)+1))),]
##        cosmidID chr order_pos    start      end  meanpos
## 30148 T23G11.11   I      1803  7701120  7701389  7701254
## 30146  T23G11.1   I      1804  7701364  7702053  7701708
## 1104    B0511.4   I      2708 10644244 10645617 10644930
## 1103    B0511.3   I      2709 10646761 10647777 10647269
## 1092   B0511.11   I      2710 10648605 10654179 10651392
## 23587   K11D2.1   I      3256 12497466 12505010 12501238
## 23589   K11D2.2   I      3257 12505027 12507569 12506298
## 14787   F32A7.6   I      3798 14850053 14855827 14852940
## 14788   F32A7.7   I      3799 14852817 14854607 14853712
## 4178   C16C8.20  II      4945  3446956  3448020  3447488
## 4179   C16C8.21  II      4946  3448205  3448759  3448482
## 5951   C30G12.2  II      6296  7290372  7291882  7291127
## 5953   C30G12.6  II      6297  7292191  7296015  7294103
## 36555 Y53F4B.45  II      8520 15042804 15044011 15043408
## 36511 Y53F4B.16  II      8521 15052084 15055718 15053901
## 36512 Y53F4B.17  II      8522 15058226 15062057 15060142
## 28657   T12B5.2 III      8781   956333   957947   957140
## 28649   T12B5.1 III      8782   962303   963679   962991
## 9580    D2024.1  IV     13989  7247542  7249437  7248490
## 9585   D2024.18  IV     13990  7250457  7254204  7252330
## 2521    C06E4.5  IV     13991  7254989  7255909  7255449
## 16715   F42G8.5  IV     14290  8136324  8139087  8137706
## 16712   F42G8.4  IV     14291  8139218  8143137  8141178
## 2758    C07G1.6  IV     14312  8209157  8210400  8209778
## 2759    C07G1.7  IV     14313  8210953  8211828  8211390
## 5517    C27D8.3  IV     15768 12706713 12710499 12708606
## 5516    C27D8.2  IV     15769 12712764 12714771 12713768
## 5515    C27D8.1  IV     15770 12717830 12719391 12718610
## 30137   T23F6.1  IV     15771 12719959 12720997 12720478
## 37611 Y62E10A.3  IV     15932 13359470 13360285 13359878
## 37612 Y62E10A.4  IV     15933 13365168 13368435 13366802
## 27428   T05C3.6   V     18122  5490369  5492261  5491315
## 27424   T05C3.2   V     18123  5493746  5503010  5498378
## 31261  W01A11.6   V     18482  6502368  6504181  6503274
## 31262  W01A11.7   V     18483  6504537  6505500  6505018
## 39377   ZC190.7   V     19246  8686070  8686864  8686467
## 39378   ZC190.8   V     19247  8686957  8687807  8687382
## 17711   F47B8.6   V     21491 14329068 14330080 14329574
## 17713   F47B8.8   V     21492 14331673 14332667 14332170
## 17712   F47B8.7   V     21493 14333415 14334803 14334109
## 33401  Y17D7B.6   V     22876 18771689 18772639 18772164
## 33402  Y17D7B.8   V     22877 18773060 18774024 18773542
## 19447   F55C9.7   V     22997 19297809 19298749 19298279
## 19441  F55C9.14   V     22998 19299561 19300461 19300011
## 19448   F55C9.8   V     22999 19301075 19302034 19301554
## 19440  F55C9.13   V     23000 19302786 19303728 19303257
## 35142  Y43F8B.9   V     23057 19479202 19480459 19479830
## 35127 Y43F8B.22   V     23058 19482834 19482984 19482909
## 19189   F54F7.6   X     27320 11865382 11866827 11866104
## 19190   F54F7.7   X     27321 11867970 11869360 11868665
## 8329    C49F5.1   X     27356 11965961 11968109 11967035
## 31214 VW06B3R.1   X     27357 11968033 12021209 11994621
#52 genes in total
#16 doublets
#4 triplets
#2 quadruplets

data_for_geomcol<-data.frame(count=c(16,4,2),size=factor(c("pairs","triplets","quadruplets"),levels=c("pairs","triplets","quadruplets")))
ggplot(data_for_geomcol)+geom_col(aes(y=count,x=size))+theme_classic()

ggsave("epimutable_gene_pairs_groupsizes.pdf")
## Saving 7 x 5 in image
#analysis by pairs (30 pairs)
gene1<-gene_order_and_positions_epimutable$cosmidID[which(diff(gene_order_and_positions_epimutable$order_pos)==1)]
gene2<-gene_order_and_positions_epimutable$cosmidID[which(diff(gene_order_and_positions_epimutable$order_pos)==1)+1]

gene1_iso<-c()
for (gene in gene1){
  gene1_iso<-c(gene1_iso,as.character(iso_to_noiso[which(iso_to_noiso$noiso==gene),"iso"]))
}
gene2_iso<-c()
for (gene in gene2){
  gene2_iso<-c(gene2_iso,as.character(iso_to_noiso[which(iso_to_noiso$noiso==gene),"iso"]))
}

test_coepimutation<-function(gene1,gene2,epimut_data,ttg_normcounts){
  
  states_gene1<-epimut_data[toString(gene1),]
  states_gene2<-epimut_data[toString(gene2),]
  matching_states<-length(states_gene1)-sum(abs(states_gene1-states_gene2))
  
  counts_gene1<-as.numeric(ttg_normcounts[toString(gene1),])
  counts_gene2<-as.numeric(ttg_normcounts[toString(gene2),])
  corr<-cor.test(counts_gene1,counts_gene2)
  
  expected_matching_states<-rep(0,10000)
  expected_corrs<-rep(0,10000)
  for (i in seq(10000)){
    random_states_gene1<-epimut_data[toString(gene1),][sample(seq(length(epimut_data[toString(gene1),])),size = length(epimut_data[toString(gene1),]),replace = FALSE)]
    random_states_gene2<-epimut_data[toString(gene2),][sample(seq(length(epimut_data[toString(gene2),])),size = length(epimut_data[toString(gene2),]),replace = FALSE)]
    expected_matching_states[i]<-(length(random_states_gene1)-sum(abs(random_states_gene1-random_states_gene2)))
    
    random_counts_gene1<-as.numeric(ttg_normcounts[toString(gene1),][sample(seq(length(epimut_data[toString(gene1),])),size = length(epimut_data[toString(gene1),]),replace = FALSE)])
    random_counts_gene2<-as.numeric(ttg_normcounts[toString(gene2),][sample(seq(length(epimut_data[toString(gene2),])),size = length(epimut_data[toString(gene2),]),replace = FALSE)])
    expected_corrs[i]<-cor(random_counts_gene1,random_counts_gene2)
    }
  
  return(c(toString(gene1),length(which(states_gene1==2)),toString(gene2),length(which(states_gene2==2)),matching_states,ncol(epimut_data),(length(which(expected_matching_states>=matching_states))+1)/length(expected_matching_states),as.numeric(corr$estimate),corr$p.value,(length(which(expected_corrs>=corr$estimate))+1)/length(expected_corrs)))
}


#example
test_coepimutation(gene1_iso[1],gene2_iso[1],epimut_data,ttg_normcounts)
##  [1] "T23G11.11"          "13"                 "T23G11.1"          
##  [4] "18"                 "32"                 "47"                
##  [7] "0.044"              "0.294758589588205"  "0.0442960257556428"
## [10] "0.0307"
test_coepimutation_results<-c()
for (i in seq(length(gene1_iso))){
  test_coepimutation_results<-rbind(test_coepimutation_results,test_coepimutation(gene1_iso[i],gene2_iso[i],epimut_data,ttg_normcounts))
}
colnames(test_coepimutation_results)<-c("gene1","gene1_up","gene2","gene2_up","matching states","total samples","sim_pvalue","pearson_r","pearson_pvalue","corrsim_pvalue")
test_coepimutation_results<-data.frame(test_coepimutation_results)

#sim test
test_coepimutation_results$sim_pvalue<-as.numeric(as.character(test_coepimutation_results$sim_pvalue))
test_coepimutation_results$sim_fdr<-p.adjust(test_coepimutation_results$sim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results))+geom_histogram(aes(x=sim_fdr))+ggtitle("simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

length(which(test_coepimutation_results$sim_fdr<0.1))
## [1] 13
length(which(test_coepimutation_results$sim_fdr<0.2))
## [1] 17
#pearson test
test_coepimutation_results$pearson_pvalue<-as.numeric(as.character(test_coepimutation_results$pearson_pvalue))
test_coepimutation_results$pearson_fdr<-p.adjust(test_coepimutation_results$pearson_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results))+geom_histogram(aes(x=pearson_fdr))+ggtitle("Pearson correlation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

length(which(test_coepimutation_results$pearson_fdr<0.1 & as.numeric(as.character(test_coepimutation_results$pearson_r>0.75))))
## Warning in Ops.factor(test_coepimutation_results$pearson_r, 0.75): '>' not
## meaningful for factors
## [1] 0
#correlation sim test
test_coepimutation_results$corrsim_pvalue<-as.numeric(as.character(test_coepimutation_results$corrsim_pvalue))
test_coepimutation_results$corrsim_fdr<-p.adjust(test_coepimutation_results$corrsim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results))+geom_histogram(aes(x=corrsim_fdr))+ggtitle("correlation simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

length(which(test_coepimutation_results$corrsim_fdr<0.1))
## [1] 25
length(which(test_coepimutation_results$corrsim_fdr<0.2))
## [1] 25
test_coepimutation_results
##        gene1 gene1_up      gene2 gene2_up matching.states total.samples
## 1  T23G11.11       13   T23G11.1       18              32            47
## 2    B0511.4        3    B0511.3        3              45            47
## 3    B0511.3        3   B0511.11        1              43            47
## 4   K11D2.1b       21  K11D2.2.1       21              31            47
## 5    F32A7.6        1    F32A7.7        6              42            47
## 6   C16C8.20        8   C16C8.21       11              44            47
## 7   C30G12.2        1  C30G12.6a        1              45            47
## 8  Y53F4B.45        9  Y53F4B.16       15              23            47
## 9  Y53F4B.16       15  Y53F4B.17       16              38            47
## 10   T12B5.2        4    T12B5.1        8              43            47
## 11   D2024.1        1   D2024.18       12              36            47
## 12  D2024.18       12    C06E4.5       12              43            47
## 13   F42G8.5        6 F42G8.4b.1       20              25            47
## 14   C07G1.6        1    C07G1.7        2              46            47
## 15  C27D8.3b       29    C27D8.2        2              16            47
## 16   C27D8.2        2    C27D8.1        1              46            47
## 17   C27D8.1        1    T23F6.1        1              47            47
## 18 Y62E10A.3       11 Y62E10A.4a       14              38            47
## 19  T05C3.6a        3    T05C3.2        4              44            47
## 20  W01A11.6        5 W01A11.7.2        1              43            47
## 21   ZC190.7        1    ZC190.8        3              45            47
## 22   F47B8.6        1    F47B8.8        1              47            47
## 23   F47B8.8        1    F47B8.7        1              47            47
## 24  Y17D7B.6        4   Y17D7B.8        1              44            47
## 25   F55C9.7        6   F55C9.14        1              42            47
## 26  F55C9.14        1    F55C9.8       18              30            47
## 27   F55C9.8       18   F55C9.13        1              30            47
## 28  Y43F8B.9        1  Y43F8B.22       10              38            47
## 29 F54F7.6.2       16    F54F7.7        1              30            47
## 30 C49F5.1.3        2 VW06B3R.1b        9              36            47
##    sim_pvalue          pearson_r pearson_pvalue corrsim_pvalue    sim_fdr
## 1      0.0498  0.294758589588205   4.429603e-02         0.0285 0.10671429
## 2      0.0096  0.837538766022239   2.154051e-13         0.0001 0.04800000
## 3      1.0001  0.182310208223178   2.200067e-01         0.0800 1.00000000
## 4      0.0335  0.394822541196271   6.024619e-03         0.0035 0.09136364
## 5      0.1328  0.349537422325746   1.602753e-02         0.0311 0.20968421
## 6      0.0001  0.921168187789753   4.602067e-20         0.0001 0.00100000
## 7      1.0001 0.0220483474951594   8.830504e-01         0.3027 1.00000000
## 8      1.0001 -0.208118945975963   1.603794e-01         0.9278 1.00000000
## 9      0.0001  0.780757778649914   9.639870e-11         0.0001 0.00100000
## 10     0.0007  0.798455033274307   1.774885e-11         0.0001 0.00480000
## 11     0.2647  0.908508458714786   1.146427e-18         0.0001 0.37814286
## 12     0.0001  0.833955391221065   3.381609e-13         0.0001 0.00100000
## 13     0.8229  0.214568326315581   1.475260e-01         0.0809 1.00000000
## 14     0.0391   0.92034233274529   5.767113e-20         0.0001 0.09346154
## 15     1.0001  -0.19817663194178   1.817671e-01         0.9099 1.00000000
## 16     0.0405  0.867701452971962   2.957836e-15         0.0001 0.09346154
## 17     0.0226  0.888045891427894   8.624323e-17         0.0009 0.06780000
## 18     0.0008  0.442047628990477   1.865218e-03         0.0027 0.00480000
## 19     0.0164  0.812801021775346   3.962912e-12         0.0001 0.06780000
## 20     0.1103   0.17607982125463   2.364470e-01         0.0321 0.19464706
## 21     0.0649  0.978724970260119   1.341549e-32         0.0001 0.12980000
## 22     0.0219   0.99999438528493  5.080949e-113         0.0002 0.06780000
## 23     0.0220  0.999989508092554  6.535434e-107         0.0206 0.06780000
## 24     0.0874  0.945280496299038   1.609033e-23         0.0001 0.16387500
## 25     0.1233  0.646397463483123   9.204028e-07         0.0001 0.20550000
## 26     0.3778  0.665442199906775   3.340901e-07         0.0025 0.50269565
## 27     0.3854  0.689153001257861   8.506693e-08         0.0041 0.50269565
## 28     0.2149  0.264792562954239   7.206587e-02         0.0598 0.32235000
## 29     1.0001 -0.244498023159829   9.766050e-02         0.9638 1.00000000
## 30     1.0001 -0.133434148988213   3.712399e-01         0.8512 1.00000000
##      pearson_fdr  corrsim_fdr
## 1   6.328004e-02 0.0427500000
## 2   6.462154e-13 0.0002500000
## 3   2.444519e-01 0.0970800000
## 4   9.512556e-03 0.0061764706
## 5   2.404129e-02 0.0437727273
## 6   2.761240e-19 0.0002500000
## 7   8.830504e-01 0.3492692308
## 8   1.924553e-01 0.9597931034
## 9   2.065686e-10 0.0002500000
## 10  4.095889e-11 0.0002500000
## 11  4.913257e-18 0.0002500000
## 12  9.222570e-13 0.0002500000
## 13  1.844075e-01 0.0970800000
## 14  2.883557e-19 0.0002500000
## 15  2.097312e-01 0.9597931034
## 16  9.859455e-15 0.0002500000
## 17  3.234121e-16 0.0019285714
## 18  3.108697e-03 0.0050625000
## 19  9.907280e-12 0.0002500000
## 20  2.533361e-01 0.0437727273
## 21  1.341549e-31 0.0002500000
## 22 1.524285e-111 0.0004615385
## 23 9.803150e-106 0.0325263158
## 24  1.206775e-22 0.0002500000
## 25  1.624240e-06 0.0002500000
## 26  6.264189e-07 0.0050000000
## 27  1.701339e-07 0.0068333333
## 28  9.827165e-02 0.0780000000
## 29  1.273833e-01 0.9638000000
## 30  3.840413e-01 0.9457777778

#with data from gen 25-100 only

epimut_data_gens25_100<-epimut_data[,25:47]
ttg_normcounts_gens25_100<-ttg_normcounts[,25:47]

test_coepimutation_results_gens25_100<-c()
for (i in seq(length(gene1_iso))){
  test_coepimutation_results_gens25_100<-rbind(test_coepimutation_results_gens25_100,test_coepimutation(gene1_iso[i],gene2_iso[i],epimut_data_gens25_100,ttg_normcounts_gens25_100))
}
colnames(test_coepimutation_results_gens25_100)<-c("gene1","gene1_up","gene2","gene2_up","matching states","total samples","sim_pvalue","pearson_r","pearson_pvalue","corrsim_pvalue")
test_coepimutation_results_gens25_100<-data.frame(test_coepimutation_results_gens25_100)


#sim test
test_coepimutation_results_gens25_100$sim_pvalue<-as.numeric(as.character(test_coepimutation_results_gens25_100$sim_pvalue))
test_coepimutation_results_gens25_100$sim_fdr<-p.adjust(test_coepimutation_results_gens25_100$sim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results_gens25_100))+geom_histogram(aes(x=sim_fdr))+ggtitle("simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

length(which(test_coepimutation_results_gens25_100$sim_fdr<0.1))
## [1] 4
length(which(test_coepimutation_results_gens25_100$sim_fdr<0.2))
## [1] 9
#pearson test
test_coepimutation_results_gens25_100$pearson_pvalue<-as.numeric(as.character(test_coepimutation_results_gens25_100$pearson_pvalue))
test_coepimutation_results_gens25_100$pearson_fdr<-p.adjust(test_coepimutation_results_gens25_100$pearson_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results_gens25_100))+geom_histogram(aes(x=pearson_fdr))+ggtitle("Pearson correlation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

length(which(test_coepimutation_results_gens25_100$pearson_fdr<0.1 & as.numeric(as.character(test_coepimutation_results_gens25_100$pearson_r>0.75))))
## Warning in Ops.factor(test_coepimutation_results_gens25_100$pearson_r, 0.75):
## '>' not meaningful for factors
## [1] 0
#correlation sim test
test_coepimutation_results_gens25_100$corrsim_pvalue<-as.numeric(as.character(test_coepimutation_results_gens25_100$corrsim_pvalue))
test_coepimutation_results_gens25_100$corrsim_fdr<-p.adjust(test_coepimutation_results_gens25_100$corrsim_pvalue,method = "fdr")
ggplot(data.frame(test_coepimutation_results_gens25_100))+geom_histogram(aes(x=corrsim_fdr))+ggtitle("correlation simulation test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

length(which(test_coepimutation_results_gens25_100$corrsim_fdr<0.1))
## [1] 22
length(which(test_coepimutation_results_gens25_100$corrsim_fdr<0.2))
## [1] 24
test_coepimutation_results_gens25_100
##        gene1 gene1_up      gene2 gene2_up matching.states total.samples
## 1  T23G11.11       11   T23G11.1       11              17            23
## 2    B0511.4        2    B0511.3        1              22            23
## 3    B0511.3        1   B0511.11        1              21            23
## 4   K11D2.1b       14  K11D2.2.1        7              14            23
## 5    F32A7.6        1    F32A7.7        5              19            23
## 6   C16C8.20        8   C16C8.21       11              20            23
## 7   C30G12.2        1  C30G12.6a        1              21            23
## 8  Y53F4B.45        8  Y53F4B.16        1              14            23
## 9  Y53F4B.16        1  Y53F4B.17        5              19            23
## 10   T12B5.2        4    T12B5.1        7              20            23
## 11   D2024.1        1   D2024.18        3              21            23
## 12  D2024.18        3    C06E4.5        3              23            23
## 13   F42G8.5        2 F42G8.4b.1       19               6            23
## 14   C07G1.6        1    C07G1.7        2              22            23
## 15  C27D8.3b       16    C27D8.2        2               5            23
## 16   C27D8.2        2    C27D8.1        1              22            23
## 17   C27D8.1        1    T23F6.1        1              23            23
## 18 Y62E10A.3        8 Y62E10A.4a       13              18            23
## 19  T05C3.6a        3    T05C3.2        3              21            23
## 20  W01A11.6        4 W01A11.7.2        1              20            23
## 21   ZC190.7        1    ZC190.8        3              21            23
## 22   F47B8.6        1    F47B8.8        1              23            23
## 23   F47B8.8        1    F47B8.7        1              23            23
## 24  Y17D7B.6        2   Y17D7B.8        1              22            23
## 25   F55C9.7        3   F55C9.14        1              21            23
## 26  F55C9.14        1    F55C9.8        8              16            23
## 27   F55C9.8        8   F55C9.13        1              16            23
## 28  Y43F8B.9        1  Y43F8B.22        9              15            23
## 29 F54F7.6.2        6    F54F7.7        1              16            23
## 30 C49F5.1.3        2 VW06B3R.1b        2              19            23
##    sim_pvalue           pearson_r pearson_pvalue corrsim_pvalue   sim_fdr
## 1      0.0306   0.653901326740574   7.137350e-04         0.0007 0.1493333
## 2      0.0870   0.976185163336012   2.033724e-15         0.0001 0.2088462
## 3      1.0001  0.0291055435365434   8.951199e-01         0.1920 1.0000000
## 4      0.1246   0.488398602698691   1.805119e-02         0.0088 0.2329412
## 5      0.2162   0.345692067343827   1.061611e-01         0.0729 0.3243000
## 6      0.0005   0.922617632780601   3.793254e-10         0.0001 0.0075000
## 7      1.0001 -0.0130343903097682   9.529306e-01         0.4116 1.0000000
## 8      1.0001   0.144694828745383   5.100793e-01         0.2561 1.0000000
## 9      0.2147   0.815735686700017   2.100078e-06         0.0001 0.3243000
## 10     0.0044   0.783306203303661   9.880117e-06         0.0002 0.0330000
## 11     0.1299    0.97215343078085   1.032325e-14         0.0001 0.2329412
## 12     0.0003   0.831599459383295   8.790134e-07         0.0001 0.0075000
## 13     0.6790   0.423113986937124   4.425741e-02         0.0309 0.8487500
## 14     0.0905   0.920171483283003   5.201921e-10         0.0001 0.2088462
## 15     1.0001  -0.357614122537788   9.386684e-02         0.9589 1.0000000
## 16     0.0876   0.901500907420391   4.345013e-09         0.0002 0.2088462
## 17     0.0435   0.978700452025632   6.368905e-16         0.0018 0.1493333
## 18     0.0022   0.334402315604938   1.188563e-01         0.0658 0.0220000
## 19     0.0343     0.8696881250749   7.100788e-08         0.0006 0.1493333
## 20     0.1742   0.149010384942001   4.974038e-01         0.0797 0.2903333
## 21     0.1267   0.982024513534572   1.088019e-16         0.0001 0.2329412
## 22     0.0440    0.99999707169656   1.976699e-56         0.0005 0.1493333
## 23     0.0448   0.999992495941199   3.864323e-52         0.0415 0.1493333
## 24     0.0838   0.995851393322125   2.380291e-23         0.0001 0.2088462
## 25     0.1320   0.754377638386002   3.204956e-05         0.0209 0.2329412
## 26     0.3500   0.874561562077928   4.866332e-08         0.0004 0.4830000
## 27     0.3542    0.83844232277318   5.870626e-07         0.0062 0.4830000
## 28     0.3847   0.200588735287752   3.587661e-01         0.1565 0.5017826
## 29     1.0001   -0.32188404806676   1.341828e-01         0.9552 1.0000000
## 30     1.0001  -0.154456695369151   4.816321e-01         0.7940 1.0000000
##     pearson_fdr corrsim_fdr
## 1  1.189558e-03 0.001500000
## 2  1.016862e-14 0.000375000
## 3  9.259861e-01 0.230400000
## 4  2.850188e-02 0.015529412
## 5  1.447651e-01 0.099409091
## 6  1.422470e-09 0.000375000
## 7  9.529306e-01 0.457333333
## 8  5.465136e-01 0.295500000
## 9  4.200157e-06 0.000375000
## 10 1.852522e-05 0.000600000
## 11 4.424251e-14 0.000375000
## 12 1.883600e-06 0.000375000
## 13 6.638611e-02 0.048789474
## 14 1.733974e-09 0.000375000
## 15 1.340955e-01 0.958900000
## 16 1.303504e-08 0.000600000
## 17 3.821343e-15 0.003600000
## 18 1.550299e-01 0.094000000
## 19 1.775197e-07 0.001384615
## 20 5.465136e-01 0.103956522
## 21 8.160140e-16 0.000375000
## 22 5.930096e-55 0.001250000
## 23 5.796484e-51 0.062250000
## 24 2.380291e-22 0.000375000
## 25 5.655804e-05 0.034833333
## 26 1.327181e-07 0.001090909
## 27 1.354760e-06 0.011625000
## 28 4.305193e-01 0.195625000
## 29 1.677285e-01 0.958900000
## 30 5.465136e-01 0.850714286
toplot<-test_coepimutation_results_gens25_100[which(as.numeric(as.character(test_coepimutation_results_gens25_100$pearson_r))>0.75),c("gene1","gene2")]
ticks<-colnames(ttg_normcounts)


for (row in seq(nrow(toplot))){
  
  gene1<-as.character(toplot[row,"gene1"]); gene2<-as.character(toplot[row,"gene2"])

  print(paste(gene1,gene2,sep=" - "))
  
  pdf(paste(paste(paste("correlated_epimutations_examples/",gene1,sep=""),gene2,sep="_vs_"),"pdf",sep="."))
  plot(as.numeric(ttg_normcounts[gene1,25:47]),as.numeric(ttg_normcounts[gene2,25:47]),xlab=paste("22G-RNA normalized counts in ",gene1,sep=""),
       ylab=paste("22G-RNA normalized counts in ",gene2,sep=""),pch=16)
  dev.off()
  
plot(as.numeric(ttg_normcounts[gene1,25:47]),as.numeric(ttg_normcounts[gene2,25:47]),xlab=paste("22G-RNA normalized counts in ",gene1,sep=""),
       ylab=paste("22G-RNA normalized counts in ",gene2,sep=""),pch=16)
  
  pdf(paste(paste("correlated_epimutations_examples/",gene1,sep=""),"pdf",sep="."))
  plot(1:23,c(as.numeric(ttg_normcounts[gene1,36]),as.numeric(ttg_normcounts[gene1,25:35]),as.numeric(ttg_normcounts[gene1,37:47])),main=gene1,
       xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
  axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)
  dev.off()
  
plot(1:23,c(as.numeric(ttg_normcounts[gene1,36]),as.numeric(ttg_normcounts[gene1,25:35]),as.numeric(ttg_normcounts[gene1,37:47])),main=gene1,
       xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
  axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)

  
  pdf(paste(paste("correlated_epimutations_examples/",gene2,sep=""),"pdf",sep="."))
  plot(1:23,c(as.numeric(ttg_normcounts[gene2,36]),as.numeric(ttg_normcounts[gene2,25:35]),as.numeric(ttg_normcounts[gene2,37:47])),col="red",main=gene2,
       xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
  axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)
  dev.off()
  
plot(1:23,c(as.numeric(ttg_normcounts[gene2,36]),as.numeric(ttg_normcounts[gene2,25:35]),as.numeric(ttg_normcounts[gene2,37:47])),col="red",main=gene2,
       xaxt="n",ylab="normalized 22G-RNA counts",xlab="line",pch=16)
       axis(1, at=1:23, labels=ticks[c(36,25:35,37:47)],las=2)

}
## [1] "B0511.4 - B0511.3"

## [1] "C16C8.20 - C16C8.21"

## [1] "Y53F4B.16 - Y53F4B.17"

## [1] "T12B5.2 - T12B5.1"

## [1] "D2024.1 - D2024.18"

## [1] "D2024.18 - C06E4.5"

## [1] "C07G1.6 - C07G1.7"

## [1] "C27D8.2 - C27D8.1"

## [1] "C27D8.1 - T23F6.1"

## [1] "T05C3.6a - T05C3.2"

## [1] "ZC190.7 - ZC190.8"

## [1] "F47B8.6 - F47B8.8"

## [1] "F47B8.8 - F47B8.7"

## [1] "Y17D7B.6 - Y17D7B.8"

## [1] "F55C9.7 - F55C9.14"

## [1] "F55C9.14 - F55C9.8"

## [1] "F55C9.8 - F55C9.13"

I checked whether these pairs tend to be in the same operon, and only two pairs out of 30 were - (F42G8.5-F42G8.4 and W01A11.6-W01A11.7) - and these two pairs have low 22G-RNA correlations across samples (0.17 and 0.21). So there is little evidence of spreading of silencing through operons in this dataset. In fact, most operons are enriched in active chromatin domains that tend to be devoid of epimutations. So perhaps the clustering we see is due to chromatin-based spreading.